/* ********************************************************************************* */
/* 4 ****************************** Inventor mobility ****************************** */
/* ********************************************************************************* */
/* Summary   : Code builds an unbalanced inventor firm year panel                    */
/* 			   and identify inventor movement                                        */
/* ********************************************************************************* */



	LIBNAME cow 'C:\cow';
	LIBNAME data 'C:\data';


************************;
* PATENT DATA FROM KPSS ;
************************;

	
* import patent-permno data for public firms from KPSS;
	proc import out=KPSS_2020_public FILE='C:\data\KPSS_2020_public.csv' dbms=csv replace ; run;	
	data KPSS_2020_public; set KPSS_2020_public; rename patent_num=patnum; rename filing_date=fdate; rename issue_date=idate; run;
	proc sort data=KPSS_2020_public; by patnum; run;
	data cow.KPSS_2020_public; set KPSS_2020_public; kpssrow=_n_; run;
	data KPSS_2020_public; set cow.KPSS_2020_public; run;

* get company name from CRSP links
  download SAS data file ccmxpf_lnkhist from WRDS CRSP at /wrds/crsp/sasdata/a_ccm ;
	proc sql; create table patnum_permno as select a.*, b.conm, b.state as statekpss, b.incorp as incorpkpss
	from KPSS_2020_public as a left join data.ccmxpf_lnkhist as b
	on a.permno=b.lpermno;
	quit;
	proc sort; by kpssrow; run;
	data patnum_permno; set patnum_permno; by kpssrow; if first.kpssrow; run;


*************************************;
* PATENT-ASSIGNEE DATA FROM PATVIEWS ;
*************************************;

* download disambiguated patent-assignee data from https://patentsview.org/download/data-download-tables 
  available from https://www.mikewoeppel.com/data > Assignees link 
  import patent-assignee data;
	proc import out=assignee_5m FILE='C:\data\assignee_5m.csv' dbms=csv replace ; run;	
	proc import out=assignee_6m FILE='C:\data\assignee_6m.csv' dbms=csv replace ; run;	
	proc import out=assignee_7m FILE='C:\data\assignee_7m.csv' dbms=csv replace ; run;	
	proc import out=assignee_8m FILE='C:\data\assignee_8m.csv' dbms=csv replace ; run;	
	proc import out=assignee_9m FILE='C:\data\assignee_9m.csv' dbms=csv replace ; run;	
	proc import out=assignee_10m FILE='C:\data\assignee_10m.csv' dbms=csv replace ; run;	
	proc import out=assignee_11m FILE='C:\data\assignee_11m.csv' dbms=csv replace ; run;	
	data assignee_all; set assignee_5m assignee_6m assignee_7m assignee_8m assignee_9m assignee_10m assignee_11m; run;
	proc sort data=assignee_all; by patnum assignee_id; run;
	proc sql; drop table assignee_5m, assignee_6m, assignee_7m, assignee_8m, assignee_9m, assignee_10m, assignee_11m; quit;

* check for double assignees for one patent;
	data a; set assignee_all; lag_patnum=lag(patnum); if lag_patnum=patnum then double=1; drop lag_:; run;
	proc sort; by patnum descending assignee_id; run;
	data assignee; set a; lag_patnum=lag(patnum); if lag_patnum=patnum then double=1; drop lag_:; run;
	proc sort data=assignee; by patnum assignee_id; run;
	proc sql; drop table assignee_all; quit;

* keep only US patents from companies (type2);
	data assignee_US2; set assignee; if (country="US" and type=2); run;

* combine KPSS patents and USPTO PatentView patents;
	data patnum_permno_assignee; set patnum_permno assignee_US2; proc sort; by patnum descending permno; run;


* combine assignee data with patent-permno data resolve unmatched double assignees;
	data patnum_permno_assignee; merge assignee patnum_permno; by patnum; 
	if (country="US" and type=2) or kpssrow~=.;
	if organization="" then organization=conm;
	if double=1 and permno~=.  then do;
	org=upcase(compress(lowcase(organization),"qwertyuiopasdfghjklzxcvbnm1234567890& ","k")); /* keep alphanumeric & space only*/
	%let vals = THE|LIMITED|LTD|LC|LLC|LP|LLP|PLLC|CORP|INCORPORATED|INC|IN|CORPORATION|CO|COMPANY|COMPANIES|AND|HOLDING|HOLDINGS; /* drop suffixes */
	regex = prxparse("s/\b(&vals.)\b//i"); /* /b signifies a word boundary, so it will remove the whole words only */
	call prxchange(regex,-1,org); drop regex;
	if substr(org,1,18)="AMERICAN TELEPHONE" or substr(org,1,14)="BELL TELEPHONE" or org="WESTERN ELECTRIC" then org="AT&T";
		checkscore=min(compged(conm,org),compged(org,conm));
	end;
	if (checkscore~=. and checkscore<500) or (org~="" and substr(conm,1,3)=substr(org,1,3)) then namematch=1; else namematch=0;
	if double=1 and country="US" and namematch=1 then match=1;
	if country="US" then US=1; 
	drop idate lat: long: county: state_fips location_id xi_: ; 
	run;
	proc sort; by kpssrow patnum descending match descending US checkscore; run;
	data patnum_permno_assignee; set patnum_permno_assignee; lag_kpssrow=lag(kpssrow); run;
	data patnum_permno_assignee; set patnum_permno_assignee; 
	if kpssrow~=. and lag_kpssrow=kpssrow then tobedelete=1; 
	drop checkscore conm org lag_kpssrow namematch match US;  
	run;

	proc sort data=patnum_permno_assignee; by patnum; run;
	data patnum_permno_assignee; set patnum_permno_assignee; patrow=_N_; run;



* import assignee data after matching names and crosschecking, save as csv data file org_id_final_US2;
	proc import out=cow.org_id_final_US2 FILE='C:\data\org_id_final_US2.csv' dbms=csv replace ; run;	


* merge org_id back to patnum_permco_assignee;
	data temp; set patnum_permno_assignee; * if kpssrow~=.; run;
	proc sql; create table patnum_permno_assignee_org_id as select * from temp as a left join cow.org_id_final_US2 as b 
	on a.country = b.country and a.type = b.type and a.organization = b.organization 
    and a.state = b.state and a.city = b.city and a.assignee_id = b.assignee_id ; quit;
	proc sort; by org_id_row; run;

* fill up org_id for permno in KPSS not identified in the name matching process;
	data temp; set patnum_permno_assignee_org_id; 
	if org_id=. then org_id=10000000; run;
	proc sort data=temp; by permno org_id country state city; run;
	data temp; set temp; by permno; retain neworg_id;
	if permno~=. and first.permno then neworg_id=org_id; run;
	data temp; set temp; if neworg_id=10000000 then neworg_id=.; run;
	data temp; set temp; if neworg_id~=. then org_id=neworg_id; drop neworg_id; 
	if org_id=10000000 then org_id=. ; run;
	proc sort; by patrow; run;

* fill up org_id for permno in KPSS without org_id;
	data org_id_miss; set temp; if permno~=. and org_id=.; keep permno org_id; run;
	proc sort nodupkey; by  permno org_id; run;
	data org_id_miss; set org_id_miss; org_id=_n_+1000000; rename org_id=org_id_miss;run;
	proc sql; create table master as select * from temp as a left join org_id_miss as b 
	on a.permno=b.permno; quit;
	proc sort; by patnum; run;
	data master; set master; if org_id=. and permno~=. then org_id=org_id_miss; 
	if org_id=. or tobedelete=1 then delete; 
	drop tobedelete organization org city country type assignee_id org_id_row org_id_miss; run;

	data master; set master; if permno=. then permno=org_id+1000000; run; 
	* permno>1000000 denotes private firms not in KPSS;
	proc sort data=master; by permno org_id ; run;
	data master; set master; run;

	proc sql; create table patnum_info as select * from master as a left join pat_allinfo as b 
	on a.patnum = b.patnum; quit;
	proc sort; by patrow; run;
	proc sql; drop table patnum_permno_assignee_org_id, master, temp; quit;
	
	data patnum_info; set patnum_info; 
	if year(fdate2)<1950 or year(fdate2)>2020 then fdate2=.; * drop all filing dates before 1950 and after 2020 due to data unreliability;
	if fdate=. then fdate=fdate2; format fdate date9.;
	drop fdate2; run;

* find the order of patent filed in each org;
	data temp; set patnum_info; proc sort; by permno fdate; run;
	data temp; set temp;  by permno fdate; retain patorder;
	lag_fdate=lag(fdate);
	if first.permno then patorder=0;
	if lag_fdate=fdate then patorder=patorder;
	patorder=patorder+1; 
	drop lag_:;	run;

* patnum_final is the final patent data file with crosschecked assignees;
	data cow.patnum_final; set temp; run;
	proc sort data=cow.patnum_final; by patnum; run; 



************************************************************************************;
************** CALCULATE GENERALITY & ORIGINALITY FOR EACH PATENT ******************;
************************************************************************************;

	
* get filing dates from patent info data available from https://www.mikewoeppel.com/data > General info link ;
	proc import out=info_5m (keep=patnum fdate idate) FILE='C:\data\info_5m.csv' dbms=csv replace ; run;	
	proc import out=info_6m (keep=patnum fdate idate) FILE='C:\data\info_6m.csv' dbms=csv replace ; run;	
	proc import out=info_7m (keep=patnum fdate idate) FILE='C:\data\info_7m.csv' dbms=csv replace ; run;	
	proc import out=info_8m (keep=patnum fdate idate) FILE='C:\data\info_8m.csv' dbms=csv replace ; run;	
	proc import out=info_9m (keep=patnum fdate idate) FILE='C:\data\info_9m.csv' dbms=csv replace ; run;	
	proc import out=info_10m (keep=patnum fdate idate) FILE='C:\data\info_10m.csv' dbms=csv replace ; run;	
	proc import out=info_11m (keep=patnum fdate idate) FILE='C:\data\info_11m.csv' dbms=csv replace ; run;	
	data cow.info; set info_5m info_6m info_7m info_8m info_9m info_10m info_11m; rename fdate=fdate2; format fdate date9.; run;
	proc sort data=cow.info; by patnum; run;
	proc sql; drop table info_5m, info_6m, info_7m, info_8m, info_9m, info_10m, info_11m; quit;


* import patent citation data available from https://www.mikewoeppel.com/data > Citations link;
	proc import out=us_cites_5m (keep=patnum patcite_num patcite_idate)  FILE='C:\data\us_cites_5m.csv' dbms=csv replace ; run;	
	proc import out=us_cites_6m (keep=patnum patcite_num patcite_idate) FILE='C:\data\us_cites_6m.csv' dbms=csv replace ; run;	
	proc import out=us_cites_7m (keep=patnum patcite_num patcite_idate) FILE='C:\data\us_cites_7m.csv' dbms=csv replace ; run;	
	proc import out=us_cites_8m (keep=patnum patcite_num patcite_idate) FILE='C:\data\us_cites_8m.csv' dbms=csv replace ; run;	
	proc import out=us_cites_9m (keep=patnum patcite_num patcite_idate) FILE='C:\data\us_cites_9m.csv' dbms=csv replace ; run;	
	proc import out=us_cites_10m (keep=patnum patcite_num patcite_idate) FILE='C:\data\us_cites_10m.csv' dbms=csv replace ; run;	
	proc import out=us_cites_11m (keep=patnum patcite_num patcite_idate) FILE='C:\data\us_cites_11m.csv' dbms=csv replace ; run;	
	data cow.us_cites; set us_cites_5m us_cites_6m us_cites_7m us_cites_8m us_cites_9m us_cites_10m us_cites_11m;
	patcitedate=datepart(patcite_idate); format patcitedate date9.; iyear=year(patcitedate); 
	patcite_num1=patcite_num*1; if patcite_num1~=.;  drop patcite_num; rename patcite_num1=patcite_num;
	drop patcite_idate patcitedate; run; 
	proc sort data=cow.us_cites; by patnum patcite_num; run;
	proc sql; drop table us_cites_5m, us_cites_6m, us_cites_7m, us_cites_8m, us_cites_9m, us_cites_10m, us_cites_11m; quit;


* import patent classification data
  Download detailed patent classification data uspc.tsv from https://patentsview.org/download/data-download-tables ;
	proc import out=cpc FILE='C:\data\uspc.tsv' dbms=tab replace ; run;	
	data cpc; set cpc; keep patent_id section_id group_id sequence category;
	rename patent_id=patnum; rename section_id=patgroup; rename group_id=class; 
	proc sort; by patnum sequence; run;
	data cpc; set cpc; if category="inventional" then cat="I"; else cat="A"; 
	if patgroup="" then patgroup="O";
	patnum1=patnum*1; drop patnum; rename patnum1=patnum;
	drop sequence category; 
	proc sort nodupkey; by patnum patgroup class cat; run;
	data cow.cpc; set cpc;  run;




*** CALCULATE CITATIONS -- ADJUSTED FOR TRUNCATION LAGS ***; 
*** 2 METHODS: AVG BY ISSUE/FILING YEAR & AVG BY ISSUE/FILING YEAR AND CLASS ***;


* count number of forward looking cites for each patent (number of cites the patent gets);
	data patcite_num; set cow.us_cites; citeunit=1; keep patcite_num citeunit; run;
	proc sort data=patcite_num; by patcite_num; run;
	proc means noprint data=patcite_num; by patcite_num; var citeunit;
	output out=cites1 (drop=_:) sum=cites_raw ; run;	
	proc sql; drop table patcite_num; quit;

* count number of forward looking cites for each patent within 5 years from grant date;
	
	* get filing date for patnum;
	data patnum_patcite_num; set cow.us_cites; keep patnum patcite_num iyear; run;
	data patnum_fyear; set cow.info; patnum_fyear=year(fdate2); keep patnum patnum_fyear ; run;
	proc sql; create table patcite_num_patnum_fyear as select * from patnum_patcite_num as a left join patnum_fyear as b 
	on a.patnum=b.patnum; quit; 

	* get issue date of patcite_num;
	data patcite_iyear; set cow.info; rename patnum=patcite_num; patcite_iyear=year(idate); 
	keep patnum patcite_iyear ; run;
	proc sql; create table patcite_num_patnum_fiyear as select * from patcite_num_patnum_fyear as a left join patcite_iyear as b 
	on a.patcite_num=b.patcite_num; quit; 

	proc sql; drop table patcite_num_patnum_fyear; quit;

	* keep only patnum within 5 years of patcite_num;
	data patcites_year5; set patcite_num_patnum_fiyear; 
	if patcite_iyear=. then patcite_iyear=iyear;
	if patcite_iyear<=patnum_fyear<=patcite_iyear+5; run;
	proc sort; by patcite_num patcite_iyear patnum_fyear; run;

	proc means noprint data=patcites_year5; by patcite_num; var patnum;
	output out=cites5 (drop=_:) n=cites_raw5 ; run;	
	proc sql; drop table patcite_num_patnum_fiyear, patcites_year5, patnum_patcite_num, patnum_fyear, patcite_iyear; quit;

	data cites; merge cites1 cites5; by patcite_num; run;

* count number of patents cited in each patent (number of patents that the patent cites);
	data patnum; set cow.us_cites; citeunit=1; keep patnum citeunit; run;
	proc sort data=patnum; by patnum; run;
	proc means noprint data=patnum; by patnum; var citeunit;
	output out=citing (drop=_:) sum=citing_raw ; run;	
	proc sql; drop table patnum; quit;

* get info for patcite;
	data infopat; set cow.info; rename patnum=patcite_num; pat_iyear=year(idate); pat_fyear=year(fdate2); 
	keep patnum pat_iyear pat_fyear; run;

* get grant year of patcite;
	proc sql; create table patcites_year as select * from cites as a left join infopat as b 
	on a.patcite_num=b.patcite_num; quit; 
	proc sql; drop table infopat; quit;

* get class of patcite_num;
	data cpc; set cow.cpc; rename patnum=patcite_num; keep patnum patgroup; if cat="I"; 
	proc sort nodupkey; by patcite_num patgroup; run;
	proc sql; create table patcites_year_class as select a.*, b.patgroup from patcites_year as a left join cpc as b 
	on a.patcite_num=b.patcite_num; quit; 
	data patcites_year_class ; set patcites_year_class;	if patgroup="" then patgroup="O"; run;

* 1A. calculate the average number of cites of patents in each class in each issue year;
	proc sort data=patcites_year_class; by pat_iyear patgroup; run;

	proc means noprint data=patcites_year_class; by pat_iyear; var cites_raw cites_raw5;
	output out=avg_cites_iyear (drop=_:) mean=avg_cites_iyear avg_cites5_iyear sum=total_cites_iyear total_cites5_iyear; run;
	proc means noprint data=patcites_year_class; by pat_iyear patgroup; var cites_raw cites_raw5;
	output out=avg_cites_iyear_class (drop=_:) mean=avg_cites_iyear_class avg_cites5_iyear_class sum=total_cites_iyear_class total_cites5_iyear_class; run;

	proc sql; create table cites_iyear_avg as select a.*, b.avg_cites_iyear, b.total_cites_iyear, b.avg_cites5_iyear, b.total_cites5_iyear
	from patcites_year_class as a left join avg_cites_iyear as b 
	on a.pat_iyear=b.pat_iyear ; quit; 
	proc sql; create table cites_iyear_class_avg as select a.*, b.avg_cites_iyear_class, b.total_cites_iyear_class, b.avg_cites5_iyear_class, b.total_cites5_iyear_class
	from cites_iyear_avg as a left join avg_cites_iyear_class as b 
	on a.pat_iyear=b.pat_iyear and a.patgroup=b.patgroup; quit; 
	proc sort; by patcite_num pat_iyear ; run;
	proc means noprint data=cites_iyear_class_avg; by patcite_num pat_iyear cites_raw cites_raw5; 
	var avg_cites_iyear avg_cites_iyear_class total_cites_iyear total_cites_iyear_class
		avg_cites5_iyear avg_cites5_iyear_class total_cites5_iyear total_cites5_iyear_class;
	output out=avg_cites_iyear_avgclass (drop=_:) 
	mean=avg_cites_iyear avg_cites_iyear_class total_cites_iyear total_cites_iyear_class
		 avg_cites5_iyear avg_cites5_iyear_class total_cites5_iyear total_cites5_iyear_class; run;

	proc sql; drop table cites_iyear_class_avg, avg_cites_iyear_class, Avg_cites_iyear, Cites_iyear_avg; quit;

* 1B. calculate the average number of cites of patents in each class in each filing year;
	proc sort data=patcites_year_class; by pat_fyear patgroup; run;

	proc means noprint data=patcites_year_class; by pat_fyear; var cites_raw cites_raw5;
	output out=avg_cites_fyear (drop=_:) mean=avg_cites_fyear avg_cites5_fyear sum=total_cites_fyear total_cites5_fyear; run;
	proc means noprint data=patcites_year_class; by pat_fyear patgroup; var cites_raw cites_raw5;
	output out=avg_cites_fyear_class (drop=_:) mean=avg_cites_fyear_class avg_cites5_fyear_class sum=total_cites_fyear_class total_cites5_fyear_class; run;

	proc sql; create table cites_fyear_avg as select a.*, b.avg_cites_fyear, b.total_cites_fyear, b.avg_cites5_fyear, b.total_cites5_fyear
	from patcites_year_class as a left join avg_cites_fyear as b 
	on a.pat_fyear=b.pat_fyear ; quit; 
	proc sql; create table cites_fyear_class_avg as select a.*, b.avg_cites_fyear_class, b.total_cites_fyear_class, b.avg_cites5_fyear_class, b.total_cites5_fyear_class    
	from cites_fyear_avg as a left join avg_cites_fyear_class as b 
	on a.pat_fyear=b.pat_fyear and a.patgroup=b.patgroup; quit; 
	proc sort; by patcite_num pat_fyear ; run;
	proc means noprint data=cites_fyear_class_avg; by patcite_num pat_fyear cites_raw cites_raw5; 
	var avg_cites_fyear avg_cites_fyear_class total_cites_fyear total_cites_fyear_class 
		avg_cites5_fyear avg_cites5_fyear_class total_cites5_fyear total_cites5_fyear_class;
	output out=avg_cites_fyear_avgclass (drop=_:) 
	mean=avg_cites_fyear avg_cites_fyear_class total_cites_fyear total_cites_fyear_class
	     avg_cites5_fyear avg_cites5_fyear_class total_cites5_fyear total_cites5_fyear_class; run;

	proc sql; drop table cites_fyear_class_avg, avg_cites_fyear_class, Avg_cites_fyear, Cites_fyear_avg; quit;


* 2. calculate the average number of patents filed in each class in each issue year;

	data infopat; set cow.info; pat_iyear=year(idate); pat_fyear=year(fdate2); 
	keep patnum pat_iyear pat_fyear; run;

* get class of patnum;
	data cpc; set cow.cpc; keep patnum patgroup; if cat="I"; 
	proc sort nodupkey; by patnum patgroup; run;
	proc sql; create table patnum_year_class as select a.*, b.patgroup from infopat as a left join cpc as b 
	on a.patnum=b.patnum; quit; 
	data patnum_year_class ; set patnum_year_class;	if patgroup="" then patgroup="O"; run;

	data patnum_year_class; set patnum_year_class; count=1; run;
	proc sort data=patnum_year_class; by pat_fyear patgroup; run;
	proc means noprint data=patnum_year_class; by pat_fyear patgroup; var count;
	output out=patcount_fyear_class (drop=_:) n=patcount_fyear_class; run;

* merge back to cites and get average for pat having more than one class;
	proc sql; create table cites_fyear_class_avg as select a.*, b.patcount_fyear_class 
	from patcites_year_class as a left join patcount_fyear_class as b 
	on a.pat_fyear=b.pat_fyear and a.patgroup=b.patgroup; quit; 
	proc sort; by patcite_num pat_fyear ; run;
	proc means noprint data=cites_fyear_class_avg; by patcite_num pat_fyear cites_raw cites_raw5; var patcount_fyear_class;
	output out=avg_patcount_fyear_avgclass (drop=_:) mean=avg_patcount_fyear_class; run;

	data cites_all; merge avg_cites_iyear_avgclass avg_cites_fyear_avgclass avg_patcount_fyear_avgclass; by patcite_num; 
	mean_patcount_fyear_class=avg_patcount_fyear_class;
	cites_countclassadj=cites_raw/avg_patcount_fyear_class; 
	cites_fclassadj=cites_raw/avg_cites_fyear_class; 
	cites_fyearadj=cites_raw/avg_cites_fyear; 
	cites_fclassadjt=cites_raw/total_cites_fyear_class; 
	cites_fyearadjt=cites_raw/total_cites_fyear; 
	cites5_countclassadj=cites_raw5/avg_patcount_fyear_class; 
	cites5_fclassadj=cites_raw5/avg_cites5_fyear_class; 
	cites5_fyearadj=cites_raw5/avg_cites5_fyear; 
	cites5_fclassadjt=cites_raw5/total_cites5_fyear_class; 
	cites5_fyearadjt=cites_raw5/total_cites5_fyear; 
	drop pat_iyear pat_fyear avg_: total_:; run;

	proc sql; drop table patcites_year, patcites_year_class, patcount_fyear_class, patnum_year_class, cites_fyear_class_avg, avg_patcount_fyear_avgclass, avg_cites_fyear_avgclass, avg_cites_iyear_avgclass; quit;



*** CALCULATE GENERALITY (One minus the HHI of the number of patents that cite you across classes) ***; 
	
* get info for pat;
	data infopat; set cow.info; rename patnum=patcite_num; pat_iyear=year(idate); pat_fyear=year(fdate2); 
	keep patnum pat_iyear pat_fyear; run;

* get filing year of patcite_num;
	data patnum; set cow.us_cites; keep patnum patcite_num; run;
	proc sql; create table patcite_num_iyear as select a.patnum, a.patcite_num, b.pat_fyear 
	from patnum as a left join infopat as b 
	on a.patcite_num=b.patcite_num; quit; 
	proc sql; drop table infopat, patnum; quit;

* get class of patnum;
	data cpc; set cow.cpc; *patgroup=substr(class,1,3); keep patnum patgroup; if cat="I"; 
	proc sort nodupkey; by patnum patgroup; run;
	proc sql; create table patnum_class as select a.patnum, a.patcite_num, a.pat_fyear, b.patgroup 
	from patcite_num_iyear as a left join cpc as b 
	on a.patnum=b.patnum; quit; 
	proc sql; drop table patcite_num_iyear; quit;

* count number of cites by group;
	data patnum_class; set patnum_class; drop patnum; count=1; if patgroup="" then patgroup="O"; run;
	proc sort data=patnum_class; by patcite_num pat_fyear patgroup; run;
	proc means noprint data=patnum_class; by patcite_num pat_fyear patgroup; var count;
	output out=patnum_count_bygroup(drop=_:) n=patnum_count; run;
	proc sql; drop table patnum_class; quit;

* calculate gener; 
	data patnum_count_bygroup; set patnum_count_bygroup; gener=patnum_count;	run;

* HHI;
	proc sort data = patnum_count_bygroup; by patcite_num pat_fyear; run;
	proc means data = patnum_count_bygroup noprint;	var gener; by patcite_num pat_fyear;
	output out = Herf_gener(drop=_:) sum(gener)=SUM_gener USS(gener)=SS_gener N=nfields_gener; run;
	data Herf_gener;	set Herf_gener;
	if SUM_gener > 0 then HHI_gener  = SS_gener  / (SUM_gener**2);
	drop SUM_gener SS_gener;
	label HHI_gener  = "HHI gener";
	run;

* adj genernality by N/(N-1) with N being the number of cites of the patent;
	proc sql; create table generality as select a.*, b.cites_raw, b.cites_raw5 
	from Herf_gener as a left join cites as b 
	on a.patcite_num=b.patcite_num; quit; 
	proc sort; by patcite_num; run;
	data generality; set generality;
	if cites_raw>2 then generality=(1-HHI_gener)*(cites_raw/(cites_raw-1)); else generality=1-HHI_gener;
	if cites_raw5>2 then generality5=(1-HHI_gener)*(cites_raw5/(cites_raw5-1)); else generality5=1-HHI_gener;
	run;


*** CALCULATE ORIGINALITY (One minus the HHI of the number of patents you cite across classes) ***; 
	
* get info for pat;
	data infopat; set cow.info; pat_iyear=year(idate); pat_fyear=year(fdate2); keep patnum pat_iyear pat_fyear; run;

* get filing year of patnum;
	data patnum; set cow.us_cites; keep patnum patcite_num; run;
	proc sql; create table patnum_iyear_temp as select a.patnum, a.patcite_num, b.pat_fyear 
	from patnum as a left join infopat as b 
	on a.patnum=b.patnum; quit; 
	proc sql; create table patnum_iyear as select *
	from patnum_iyear_temp as a left join citing as b 
	on a.patnum=b.patnum; quit; 
	proc sql; drop table infopat, patnum, patnum_iyear_temp; quit;

* get class of patcite_num;
	data cpc; set cow.cpc; *patgroup=substr(class,1,3); rename patnum=patcite_num; keep patnum patgroup; if cat="I"; 
	proc sort nodupkey; by patcite_num patgroup; run;
	proc sql; create table patcites_class as select a.patnum, a.patcite_num, a.pat_fyear, b.patgroup 
	from patnum_iyear as a left join cpc as b 
	on a.patcite_num=b.patcite_num; quit; 
	proc sql; drop table patnum_iyear; quit;
	
* count number of cites by group;
	data patcites_class; set patcites_class; drop patcite_num; count=1; if patgroup="" then patgroup="O"; run;
	proc sort data=patcites_class; by patnum pat_fyear patgroup; run;
	proc means noprint data=patcites_class; by patnum pat_fyear patgroup; var count;
	output out=patcite_count_bygroup(drop=_:) n=patcite_count; run;
	proc sql; drop table patcites_class; quit;

* calculate origi; 
	data patcite_count_bygroup; set patcite_count_bygroup; origi=patcite_count;	run;

* HHI;
	proc sort data = patcite_count_bygroup; by patnum pat_fyear; run;
	proc means data = patcite_count_bygroup noprint;	var origi; by patnum pat_fyear;
	output out = Herf_origi(drop=_:) sum(origi)=SUM_origi USS(origi)=SS_origi N=nfields_origi; run;
	data Herf_origi;	set Herf_origi;
	if SUM_origi > 0 then HHI_origi  = SS_origi  / (SUM_origi**2);
	drop SUM_origi SS_origi;
	label HHI_origi  = "HHI origi";
	run;

* adj Originality by N/(N-1) with N being the number of cites of the patent;
	proc sql; create table originality as select a.*, b.cites_raw, b.cites_raw5  
	from Herf_origi as a left join cites as b 
	on a.patnum=b.patcite_num; quit; 
	proc sort; by patnum; run;
	data originality; set originality;
	if cites_raw>2 then originality=(1-HHI_origi)*(cites_raw/(cites_raw-1)); else originality=1-HHI_origi;
	if cites_raw5>2 then originality5=(1-HHI_origi)*(cites_raw5/(cites_raw5-1)); else originality5=1-HHI_origi;
	run;

* adj Originality by N/(N-1) with N being the number of citing of the patent;
	proc sql; create table originality_alt as select a.*, b.citing_raw 
	from Herf_origi as a left join citing as b 
	on a.patnum=b.patnum; quit; 
	proc sort; by patnum; run;
	data originality_alt; set originality_alt;
	if citing_raw>2 then originality_alt=(1-HHI_origi)*(citing_raw/(citing_raw-1)); else originality_alt=1-HHI_origi;
	run;

* merge all; 
	data cites_raw; set cites_all; rename patcite_num=patnum; proc sort; by patnum; run;
	data generality; set generality; patnum=patcite_num; keep patnum generality generality5; proc sort; by patnum; run;
	data originality; set originality; keep patnum originality originality5; proc sort; by patnum; run;
	data originality_alt; set originality_alt; keep patnum originality_alt; proc sort; by patnum; run;
	data allcites; merge cites_raw generality originality originality_alt; by patnum; 
	if patnum~=.; run; 
	data cow.allcites; set allcites; run;

	proc sql; drop table cites_raw, allcites, herf_gener, herf_origi, generality, originality, originality_alt, Patcite_count_bygroup, Patnum_count_bygroup; quit;

	

******************************;
* INVENTOR DATA FROM PATVIEWS ;
******************************;

* import inventor info data available from https://www.mikewoeppel.com/data > Inventors link ;
	proc import out=inventor_5m (keep=patnum inventor_id country) FILE='C:\data\inventor_5m.csv' dbms=csv replace ; run;	
	proc import out=inventor_6m (keep=patnum inventor_id country) FILE='C:\data\inventor_6m.csv' dbms=csv replace ; run;	
	proc import out=inventor_7m (keep=patnum inventor_id country) FILE='C:\data\inventor_7m.csv' dbms=csv replace ; run;	
	proc import out=inventor_8m (keep=patnum inventor_id country) FILE='C:\data\inventor_8m.csv' dbms=csv replace ; run;	
	proc import out=inventor_9m (keep=patnum inventor_id country) FILE='C:\data\inventor_9m.csv' dbms=csv replace ; run;	
	proc import out=inventor_10m (keep=patnum inventor_id country) FILE='C:\data\inventor_10m.csv' dbms=csv replace ; run;	
	proc import out=inventor_11m (keep=patnum inventor_id country) FILE='C:\data\inventor_11m.csv' dbms=csv replace ; run;	
	data cow.inventor; set inventor_5m inventor_6m inventor_7m inventor_8m inventor_9m inventor_10m inventor_11m; run;
	proc sort data=cow.inventor; by patnum; run;
	proc sql; drop table inventor_5m, inventor_6m, inventor_7m, inventor_8m, inventor_9m, inventor_10m, inventor_11m; quit;




*******************;
* MERGING ALL DATA ;
*******************;

* merge inventor-company data;
	data patnum_final; set cow.patnum_final; 	proc sort; by patnum; run;
	data allcites; set cow.allcites; 	proc sort; by patnum; run;
	data all_uspto; merge cow.inventor patnum_final allcites; by patnum; pyear=year(fdate); uspto=1; if patnum~=.; run;

* get patent class for each firm as industry class;
	data firm; set all_uspto; keep patnum pyear permno; proc sort nodupkey; by patnum pyear permno; run;
	data cpc; set cow.cpc; patgroup=substr(class,1,1); keep patnum patgroup class; if cat="I"; 
	proc sort nodupkey; by patnum patgroup class; run;
	proc sql; create table firm_patgroup as select a.* , b.patgroup, b.class
	from firm as a left join cpc as b 
	on a.patnum=b.patnum; quit; 
	proc sort nodupkey; by permno patnum pyear patgroup class; run;

	data permno_pyear; set firm_patgroup; keep permno pyear; if permno~=.; proc sort nodupkey; by permno pyear; run;

	data pyear; set firm; keep pyear; if 1990<=pyear; proc sort nodupkey; by pyear; run;
	data permno; set firm; keep permno; if permno~=.; proc sort nodupkey; by permno; run;

	proc sql; create table joint as select * from permno as a, pyear as b; quit;
	
	proc sql; create table firm_patgroup5 as select a.* , b.patgroup, b.class, b.pyear as myear, b.patnum
	from joint as a left join firm_patgroup as b 
	on a.permno=b.permno and b.pyear-4<=a.pyear<=b.pyear; quit; 
	
	proc sort data=firm_patgroup5 ; by permno pyear patgroup descending myear patnum ; run;
	proc means data=firm_patgroup5 noprint; var patnum; by permno pyear patgroup;
	output out=permno_pyear_patgroup  n=; run;
	data permno_pyear_patgroup; set permno_pyear_patgroup; if patgroup="" then patgroup="O"; 
	keep permno pyear patgroup patnum; run;
	proc sort; by permno pyear patgroup descending patnum; run;
	data permno_patgroup_one; set permno_pyear_patgroup; by permno pyear; if first.permno or first.pyear; keep permno pyear patgroup; run;

	proc sort data=firm_patgroup5 ; by permno pyear class descending myear patnum ; run;
	proc means data=firm_patgroup5 noprint; var patnum; by permno pyear class;
	output out=permno_pyear_class  n=; run;
	data permno_pyear_class; set permno_pyear_class; if class="" then class="O"; 
	keep permno pyear class patnum; run;
	proc sort; by permno pyear class descending patnum; run;
	data permno_patgroup_three; set permno_pyear_class; by permno pyear; if first.permno or first.pyear; keep permno pyear class; run;

	data permno_ind; merge permno_patgroup_one permno_patgroup_three; by permno pyear; run;
	proc sql; drop table permno_patgroup_one, permno_patgroup_three, firm_patgroup5, permno_pyear_class, permno_pyear_patgroup; quit;

* count the number of patents and the number of cites (raw and adj) for each inventor to rank as superstars;
	data inv; set all_uspto; keep inventor_id uspto cites_raw cites_countclassadj cites_fclassadj cites_iclassadj cites_iyearadj cites_fyearadj; if inventor_id~=""; run;
	proc sql; drop table all_uspto; quit;

	proc sort data=inv; by inventor_id; run;
	proc means noprint data=inv; by inventor_id; 
	var uspto cites_raw cites_countclassadj cites_fclassadj cites_fyearadj; 
	output out=inventor_cites (drop=_:) sum= ; run;
	proc sql; drop table inv; quit;
	proc rank data=inventor_cites out=invrank_uspto groups=100; var uspto; ranks patcount_rank; run;
	proc rank data=inventor_cites out=invrank_cites_raw groups=100; var cites_raw; ranks cites_raw_rank; run;
	proc rank data=inventor_cites out=invrank_cites_fyearadj groups=100; var cites_fyearadj; ranks cites_fyearadj_rank; run;
	proc rank data=inventor_cites out=invrank_cites_fclassadj groups=100; var cites_fclassadj; ranks cites_fclassadj_rank; run;
	data inventor_cites_rank; merge invrank_uspto invrank_cites_raw invrank_cites_fyearadj  invrank_cites_fclassadj ; by inventor_id; run;
	proc sql; drop table invrank_uspto, invrank_cites_raw, invrank_cites_fyearadj, invrank_cites_fclassadj; quit;


***********************************************************************************;
* Find the first and last patent date for each inventor in each public firm in KPSS;
***********************************************************************************;

	data all_uspto; merge cow.inventor patnum_final allcites; by patnum; 
	pyear=year(fdate); uspto=1; if patnum~=.; if inventor_id~=""; run;

	data inventor_permno_year_first; set all_uspto; if inventor_id~="" and permno~=.; 
	rename fdate=firstfdate; keep inventor_id country permno fdate state statekpss incorpkpss double kpssrow patrow patorder;
	proc sort; by inventor_id permno firstfdate; run;
	data inventor_permno_year_first; set inventor_permno_year_first; by inventor_id permno; 
	if first.inventor_id or first.permno; run;
	data inventor_permno_year_last; set all_uspto; if inventor_id~="" and permno~=.; 
	rename fdate=lastfdate; keep inventor_id country permno fdate state statekpss incorpkpss double kpssrow patrow patorder;
	proc sort; by inventor_id permno descending lastfdate; run;
	data inventor_permno_year_last; set inventor_permno_year_last; by inventor_id permno;
	if first.inventor_id or first.permno; run;
	data inventor_permno_year; merge inventor_permno_year_first inventor_permno_year_last;  
	by inventor_id permno; run;
	proc sql; drop table inventor_permno_year_first, inventor_permno_year_last; quit;
	proc sort ; by inventor_id firstfdate; run;

* get the list of KPSS inventors;
	proc sort data=kpss_2020_public; by patnum; run;
	data all_kpss; merge cow.inventor kpss_2020_public; by patnum; pyear=year(fdate);  
	if inventor_id~="";
	run; 
	data kpss_inventors; set all_kpss; kpssinventor=1; keep inventor_id kpssinventor; 
	proc sort nodupkey; by inventor_id; run;
	proc sql; drop table all_kpss; quit;

* count the number of patent and cites in each year for each inventor in each company;
	data inventor_org_year_patcount; set all_uspto; if inventor_id~="";
	if cites_raw=. then cites_raw=0; 	if cites_adj=. then cites_adj=0; 
		drop first_name last_name male city latitude longitude state_fips county_fips location_id double; run;
	
	proc sql; drop table all_uspto; quit;

	data inventor_org_year_patcount; set  inventor_org_year_patcount;
		drop first_name last_name male city latitude longitude state_fips county_fips location_id double; run;

	proc sort data=inventor_org_year_patcount; by inventor_id permno pyear; run;
	proc means noprint data=inventor_org_year_patcount; by inventor_id permno pyear; 
	var cites_raw cites_countclassadj cites_fyearadj  cites_fclassadj  cites_fyearadjt  cites_fclassadjt
		cites_raw5 cites5_countclassadj cites5_fyearadj  cites5_fclassadj  cites5_fyearadjt  cites5_fclassadjt; 
	output out=inventor_org_year_sum (drop=_type_) 
	sum= patcitespyear patcitespyear_countclassadj patcitespyear_fyearadj  patcitespyear_fclassadj  patcitespyear_fyearadjt  patcitespyear_fclassadjt
		 patcites5pyear patcites5pyear_countclassadj patcites5pyear_fyearadj  patcites5pyear_fclassadj  patcites5pyear_fyearadjt  patcites5pyear_fclassadjt; 
	run;
	proc means noprint data=inventor_org_year_patcount; by inventor_id permno pyear; 
	var originality originality_alt generality originality5 generality5  ; 
	output out=inventor_org_year_avg (drop=_:) mean= ; run;
	data inventor_org_year_sumavg; merge inventor_org_year_sum inventor_org_year_avg; by inventor_id permno pyear;
	rename _freq_= patcountpyear; run;
	proc sql; drop table inventor_org_year_sum, inventor_org_year_avg, inventor_org_year_patcount; quit;

	proc sort data=inventor_org_year_sumavg; by pyear; run;
	proc means noprint data=inventor_org_year_sumavg; by pyear; var patcountpyear; 
	output out=patcountpyear_avg (drop=_:) mean=patcountpyear_avg; run;
	proc sql; create table inventor_org_year_count as select * 
	from inventor_org_year_sumavg as a left join patcountpyear_avg as b
	on a.pyear=b.pyear; quit;
	data inventor_org_year_count; set inventor_org_year_count; 
	patcountpyear_adj=patcountpyear/patcountpyear_avg; 
	patcountpyear_fclassadj=patcountpyear/mean_patcount_fyear_class;
	run;
	proc sort data=inventor_org_year_count; by inventor_id permno pyear; run;

	proc sql; drop table inventor_org_year_sumavg; quit;


* keep only the first patent in each year for each inventor in each company;
	data all_uspto; merge cow.inventor patnum_final allcites; by patnum; pyear=year(fdate); uspto=1; if patnum~=.; if inventor_id~=""; run;

	data inventor_org_year_first; set all_uspto; 
	keep inventor_id country permno pyear fdate patorder state statekpss incorpkpss double kpssrow patrow patorder; 
	rename fdate=firstfdate; rename patorder=firstpatorder; 
	run;
	proc sort; by inventor_id permno pyear firstfdate; run;
	data inventor_org_year_first; set inventor_org_year_first; by inventor_id permno pyear; 
	if first.inventor_id or first.permno or first.pyear; 
	run;

* keep only the last patent in each year for each inventor in each company;
	data inventor_org_year_last; set all_uspto; 
	keep inventor_id country permno pyear fdate patorder state statekpss incorpkpss double kpssrow patrow patorder; 
	rename fdate=lastfdate; drop patnum patrow cites double; 
	run;
	proc sort; by inventor_id permno pyear descending lastfdate; run;
	data inventor_org_year_last; set inventor_org_year_last; by inventor_id permno pyear;
	if first.inventor_id or first.permno or first.pyear; 
	run;

* merge first and last patent into one record in each year for each inventor in each company;
	data inventor_org_year; merge inventor_org_year_first inventor_org_year_last inventor_org_year_count;  
	by inventor_id permno pyear; inv_yr_row=_n_; run;
	proc sql; drop table inventor_org_year_first, inventor_org_year_last, inventor_org_year_count; quit;

	proc sort data=inventor_org_year; by descending inv_yr_row; run;
	data inventor_org_year; set inventor_org_year;
	lead_inventor_id=lag(inventor_id); run;

	proc sort data=inventor_org_year; by inventor_id pyear firstfdate permno ; run;
	data inventor_org_year; set inventor_org_year; drop inv_yr_row; run;
	data inventor_org_year; set inventor_org_year; inv_yr_row=_N_; run;
 
	proc sql; drop table all_uspto; quit;


**************************************;
* Identify movers and the moving year ;
**************************************;
	data temp; merge inventor_org_year kpss_inventors(in=in1); by inventor_id; if in1; run;
	proc sort data=temp; by inventor_id pyear firstfdate permno ; run;
	data temp; set temp; inventorlevelrow=_n_; run;
	proc sort data=temp; by descending inventorlevelrow; run;
	data temp; set temp; 
	lead_inventor_id=lag(inventor_id);
	lead_2inventor_id=lag2(inventor_id);
	lead_3inventor_id=lag3(inventor_id);
	lead_4inventor_id=lag4(inventor_id);
	lead_permno=lag(permno);
	lead_2permno=lag2(permno);
	lead_3permno=lag3(permno);
	lead_4permno=lag4(permno);
	run; 
	proc sort data=temp; by inventorlevelrow; run;
	data temp1; set temp; 
	if inventor_id~="" ; 
	lag_inventor_id=lag(inventor_id);
	lag_2inventor_id=lag2(inventor_id);
	lag_3inventor_id=lag3(inventor_id);
	lag_4inventor_id=lag4(inventor_id);
	lag_permno=lag(permno);
	lag_2permno=lag2(permno);
	lag_3permno=lag3(permno);
	lag_4permno=lag4(permno);
	lag_pyear=lag(pyear); 
	lag_lastfdate=lag(lastfdate); 
	lag_patorder=lag(patorder);

	if lag_inventor_id~=inventor_id and lead_inventor_id~=inventor_id then delete;* drop inventors who appear once;

	* delete wrong insert from USPTO;
	if lag_permno<1000000 and lag_inventor_id=inventor_id and lead_inventor_id=inventor_id and lag_permno~=permno and lag_permno=lead_permno and kpssrow=. then delete;

	if lag_permno<1000000 and lag_inventor_id=inventor_id and lag_permno~=permno and lag_permno=lead_2permno and kpssrow=. then delete;
	if lag_2permno<1000000 and lag_2inventor_id=inventor_id and lag_2permno~=permno and lag_2permno=lead_permno and kpssrow=. then delete;

	if lag_permno<1000000 and lag_inventor_id=inventor_id and lag_permno~=permno and lag_permno=lead_3permno and kpssrow=. then delete;
	if lag_2permno<1000000 and lag_2inventor_id=inventor_id and lag_2permno~=permno and lag_2permno=lead_2permno and kpssrow=. then delete;
	if lag_3permno<1000000 and lag_3inventor_id=inventor_id and lag_3permno~=permno and lag_3permno=lead_permno and kpssrow=. then delete;

	if lag_permno<1000000 and lag_inventor_id=inventor_id and lag_permno~=permno and lag_permno=lead_4permno and kpssrow=. then delete;
	if lag_2permno<1000000 and lag_2inventor_id=inventor_id and lag_2permno~=permno and lag_2permno=lead_3permno and kpssrow=. then delete;
	if lag_3permno<1000000 and lag_3inventor_id=inventor_id and lag_3permno~=permno and lag_3permno=lead_2permno and kpssrow=. then delete;
	if lag_4permno<1000000 and lag_4inventor_id=inventor_id and lag_4permno~=permno and lag_4permno=lead_permno and kpssrow=. then delete;

	drop lag_: lead_:;
	run;

	
  	data inventorlevel_0; set temp1; * assign USPTO state as incorp and hq states for private firms;
	if incorpkpss="" then incorpkpss=state;
	if statekpss="" then statekpss=state;
	drop state;
	run;

	data inventorlevel_0; set inventorlevel_0; 
	incorp=incorpkpss; drop incorpkpss;
	state=statekpss; drop statekpss;

	lag_inventor_id=lag(inventor_id);
	lag_permno=lag(permno);
	lag_pyear=lag(pyear); 
	lag_lastfdate=lag(lastfdate); 
	lag_patorder=lag(patorder);
	lag_incorp=lag(incorp);
	lag_state=lag(state);

	* assume moving time to be half way between the two data points;
	if lag_inventor_id=inventor_id and lag_permno~=permno then 
	do; 
		myear=pyear-floor((pyear-lag_pyear)/2); 
		mdate=firstfdate-floor((firstfdate-lag_lastfdate)/2); 
		format mdate date9.; 
	end;

	count=1;

	if myear~=. then move=1;
	if move=1 and permno>1000000 then move_topriv=1; else move_topriv=0; 
	if move=1 and lag_permno<1000000 and permno>1000000 then move_pubtopriv=1; else move_pubtopriv=0; 
	if move=1 and permno>1000000 and patorder<=1 then move_tostup=1; else move_tostup=0; 
	if move=1 and lag_permno<1000000 and permno>1000000 and patorder<=3 then move_pubtostup=1; else move_pubtostup=0; 

	prior_permno=lag_permno;
	if kpssrow~=. then kpss=1; else kpss=0;
	drop lead_: kpssinventor move ;
	run;



	data moveyear; set inventorlevel_0; if myear~=.;
	keep lag_permno lag_incorp lag_state lag_inventor_id myear move_: ; run; 
	data moveyear; set moveyear; move=1;
	rename lag_permno=permno; rename lag_incorp=incorp; 
	rename lag_state=state; rename lag_inventor_id=inventor_id; 
	rename myear=pyear; 
	run;
	data inventorlevel_0; set inventorlevel_0;
	move_topriv=0; move_pubtopriv=0; move_tostup=0; move_pubtostup=0; 
	run;

	data inventorlevel; set inventorlevel_0 moveyear; run;

	proc sql; drop table inventorlevel_0, temp1, temp; quit;

	proc sort data=inventorlevel; by inventor_id pyear firstfdate permno; run;

	data inventorlevel; set inventorlevel; row=_n_; run;
	proc sort data=inventorlevel; by descending row; run;
	data inventorlevel; set inventorlevel; 
	lag_move=lag(move); 
	lag_pyear=lag(pyear);
	if lag_move=1 and lag_pyear=pyear then delete; run;
	proc sort data=inventorlevel; by row; run;

	* identify the first and last year each inventor and each firm;
	data z; set inventorlevel; keep inventor_id permno pyear incorp state ; if move~=1; run;
	proc sort data=z; by inventor_id permno pyear; run;
	data fpyear; set z; by inventor_id permno; if first.inventor_id or first.permno; rename pyear=fpyear; run;
	proc sort data=z; by inventor_id permno descending pyear; run;
	data lpyear; set z; by inventor_id permno; if first.inventor_id or first.permno; rename pyear=lpyear; run;
	data flpyear; merge fpyear lpyear; by inventor_id permno; if permno~=.; run;
	
	data pyear; set z; keep pyear; if pyear~=.; proc sort nodupkey; by pyear; run;
	
	proc sql; create table joint as select * from flpyear as a, pyear as b
	where a.fpyear<=b.pyear<=a.lpyear; quit;

	data joint; set joint; drop fpyear lpyear; proc sort; by inventor_id pyear permno; run;
	data joint_extra; set joint moveyear; run;
	proc sort; by inventor_id pyear permno; run;

	* keep the patcount and cites for each inventor firm year;
	data invfirmyr; set inventorlevel; 
	keep inventor_id permno pyear 
	patcountpyear patcountpyear_adj patcountpyear_fclassadj 
	patcitespyear patcitespyear_countclassadj patcitespyear_fyearadj  patcitespyear_fclassadj patcitespyear_fyearadjt  patcitespyear_fclassadjt  
	patcites5pyear patcites5pyear_countclassadj patcites5pyear_fyearadj  patcites5pyear_fclassadj patcites5pyear_fyearadjt  patcites5pyear_fclassadjt
	generality originality originality_alt generality5 originality5  ; run;
	proc sort; by inventor_id pyear permno; run;

	data joint_extra_rank; merge joint_extra invfirmyr;  by inventor_id pyear permno; run;

	proc sql; drop table inventorlevel, z, Inventor_permno_year, Invfirmyr, joint, joint_extra; quit;
	
	data inventorjoint; merge joint_extra_rank Inventor_cites_rank;  by inventor_id; run;
	proc sql; drop table joint_extra_rank; quit;

	proc sql; create table inventorfinal as select * from inventorjoint as a left join permno_ind as b
	on a.permno=b.permno and a.pyear=b.pyear; quit;

	data inventorlevel; set inventorfinal; if 1996<=pyear<=2018; if incorp~=""; run;
	PROC EXPORT DATA= inventorlevel OUTFILE= "C:\cow\inventorlevel.dta" DBMS=STATA REPLACE;RUN;
	proc sql; drop table inventorlevel, temp; quit; 


**********************;
* Dataset of stayers *;
**********************;
	
	* identify the movers;
	data mover; set inventorfinal; if move=1; leaver=1; keep leaver inventor_id permno pyear; run;
	proc sort nodupkey; by inventor_id permno pyear; run;

	* identify the first and last year of inventors not in moving years;
	data z; set inventorfinal; keep inventor_id permno pyear incorp state ; if move~=1; run;
	proc sort data=z; by inventor_id permno pyear; run;
	data fpyear; set z; by inventor_id permno; if first.inventor_id or first.permno; rename pyear=fpyear; run;
	proc sort data=z; by inventor_id permno descending pyear; run;
	data lpyear; set z; by inventor_id permno; if first.inventor_id or first.permno; rename pyear=lpyear; run;
	data flpyear; merge fpyear lpyear; by inventor_id permno; if permno~=.; drop incorp state; run;

	* get information of the first and last year working by the movers before moving;
	proc sql; create table mover_flpyear as select * from mover as a left join flpyear as b
	on a.inventor_id=b.inventor_id and a.permno=b.permno and a.pyear>=b.lpyear; quit;

	* flag the leaving inventor in the inventor panel;
	data a; set inventorfinal; if 1996<=pyear<=2018; if incorp~=""; run; 
	proc sql; create table inventor_stayer_temp as select a.*, b.leaver from a as a left join mover_flpyear as b
	on a.inventor_id=b.inventor_id and a.permno=b.permno and b.fpyear<=a.pyear<=b.lpyear; quit;
	proc sql; drop table a; quit;
	proc sort data=inventor_stayer_temp;  by inventor_id pyear permno; run;
	proc sql; drop table z, lpyear, fpyear, flpyear; quit;

	data inventor_stayer; set inventor_stayer_temp; if move=1 or leaver=1 then leaving=1; else leaving=0;  
	if 1996<=pyear<=2018; if incorp~=""; run;
	PROC EXPORT DATA= inventor_stayer OUTFILE= "C:\cow\inventor_stayer.dta" DBMS=STATA REPLACE;RUN;
	proc sql; drop table inventor_stayer, inventor_stayer_temp; quit; 
	* in the stayer analysis use only leaving=0;


* calculate priv vs public firms;
	data a; set inventorfinal; if 1996<=pyear<=2018; 
	if incorp~="" and state~="" and incorp not in ("AS", "INTL", "PR", "TT", "VI");
	if permno>=1000000 then priv=1; else priv=0; keep permno priv;
	proc sort nodupkey; by permno priv; run;
	proc freq data=a; table priv; run;

* calculate % of move;
	data b; set inventorfinal; if 1996<=pyear<=2018; 
	if incorp~="" and state~="" and incorp not in ("AS", "INTL", "PR", "TT", "VI");
	keep inventor_id permno move;
	proc sort nodupkey; by inventor_id permno descending move; run;
	data b; set b; by inventor_id; if first.inventor_id or first.permno ; if move=. then move=0; run;
	proc freq; table move; run; 

